/* Copyright (c) 2003 The Nutch Organization. All rights reserved. */ /* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */ package net.nutch.tools; import java.io.*; import java.util.*; import java.util.logging.*; import net.nutch.db.*; import net.nutch.util.*; import net.nutch.linkdb.*; import net.nutch.pagedb.*; import net.nutch.pagedb.*; /****************************************** * The WebDBAdminTool is for Nutch administrators * who need special access to the webdb. It allows * for finer editing of the stored values. * * @author Mike Cafarella ******************************************/ public class WebDBAdminTool { public static final Logger LOG = LogFormatter.getLogger("net.nutch.tools.WebDBAdminTool"); IWebDBReader reader; public WebDBAdminTool(IWebDBReader reader) { this.reader = reader; } /** * Emit the webdb to 2 text files. */ public void textDump(String dumpName) throws IOException { // // First the pages // PrintStream out = new PrintStream(new BufferedOutputStream(new FileOutputStream(new File(dumpName + ".pages")))); try { for (Enumeration e = reader.pages(); e.hasMoreElements(); ) { Page p = (Page) e.nextElement(); out.println(p.toTabbedString()); } } finally { out.close(); } // // Then the links // out = new PrintStream(new BufferedOutputStream(new FileOutputStream(new File(dumpName + ".links")))); try { for (Enumeration e = reader.links(); e.hasMoreElements(); ) { Link l = (Link) e.nextElement(); out.println(l.toTabbedString()); } } finally { out.close(); } } /** * Emit the top K-rated Pages. */ public void emitTopK(int k) throws IOException { // Create a sorted list SortedSet topSet = new TreeSet(new Comparator() { public int compare(Object o1, Object o2) { Page p1 = (Page) o1; Page p2 = (Page) o2; if (p1.getScore() < p2.getScore()) { return -1; } else if (p1.getScore() == p2.getScore()) { return 0; } else { return 1; } } } ); // Find the top k elts Page lowestPage = null; for (Enumeration e = reader.pages(); e.hasMoreElements(); ) { Page curPage = (Page) e.nextElement(); if (topSet.size() < k) { topSet.add(curPage); lowestPage = (Page) topSet.first(); } else if (lowestPage.getScore() < curPage.getScore()) { topSet.remove(lowestPage); topSet.add(curPage); lowestPage = (Page) topSet.first(); } } // Print them out int i = 0; for (Iterator it = topSet.iterator(); it.hasNext(); i++) { LOG.info("Page " + i + ": " + (Page) it.next()); } } /** * Emit each page's score and link data */ public void scoreDump() throws IOException { for (Enumeration e = reader.pages(); e.hasMoreElements(); ) { Page p = (Page) e.nextElement(); Link links[] = reader.getLinks(p.getURL()); int numLinks = 0; if (links != null) { numLinks = links.length; } LOG.info(p.getURL() + "\t" + p.getScore() + "\t" + numLinks); } } /** * This tool performs a number of generic db management tasks. * Right now, it only emits the text-format database. */ public static void main(String argv[]) throws FileNotFoundException, IOException { if (argv.length < 2) { System.out.println("Usage: java net.nutch.tools.WebDBAdminTool db [-create] [-textdump dumpPrefix] [-scoredump] [-top k]"); return; } String dir = argv[0]; boolean create = false; int k = 0; String command = null, dumpName = null; for (int i = 1; i < argv.length; i++) { if ("-create".equals(argv[i])) { command = argv[i]; create = true; } else if ("-textdump".equals(argv[i])) { command = argv[i]; i++; dumpName = argv[i]; } else if ("-top".equals(argv[i])) { command = argv[i]; i++; k = Integer.parseInt(argv[i]); } else if ("-scoredump".equals(argv[i])) { command = argv[i]; } } // // For db creation // if ("-create".equals(command)) { WebDBWriter.createWebDB(new File(dir)); LOG.info("Created webdb at " + dir); return; } // // For other functions // IWebDBReader reader = new WebDBReader(new File(dir)); try { WebDBAdminTool admin = new WebDBAdminTool(reader); if ("-textdump".equals(command)) { admin.textDump(dumpName); } else if ("-top".equals(command)) { admin.emitTopK(k); } else if ("-scoredump".equals(command)) { admin.scoreDump(); } } finally { reader.close(); } } }